# Build the industry-to-O*NET crosswalk for the heterogeneity analysis.
# Maps each entrepreneur's SIC industry code to O*NET work context measures
# (e.g., frequency of face-to-face discussions).
#
# This is step 2 of the industry-to-O*NET crosswalk (step 1 is
# match_industry_to_occ.py, which produces industry_with_matches.csv).
#
# Uses:
#   data/admin/external/entrepreneur_industries.csv   List of SIC codes in the sample
#   data/admin/external/industry_with_matches.csv     SIC-to-occupation matches (from step 1)
#   data/onet/Occupation Data.txt                O*NET occupation codes/titles
#   data/onet/Work Context.txt                   O*NET work context measures
#
# Creates:
#   data/admin/external/entrepreneur_industries_to_SOC.csv    SIC -> SOC code mapping
#   data/admin/external/entrepreneur_industries_to_ONET.csv   SIC -> SOC -> O*NET work context

library(data.table)

project_root <- file.path(dirname(sys.frame(1)$ofile), "..", "..", "..")
onet_path <- file.path(project_root, "data", "onet")

# --- load entrepreneur industries ---
dt_industries <- fread(file.path(project_root, "data", "admin", "external", "entrepreneur_industries.csv"))
dt_industries <- dt_industries[!is.na(sic_code)]
dt_industries[, sic_code_orig := sic_code]

# --- load semantic matches (from step 1) ---
dt_matches <- fread(file.path(project_root, "data", "admin", "external", "industry_with_matches.csv"))
dt_matches[, sic_code := as.numeric(code)]
dt_matches <- unique(dt_matches[, .(sic_code, name, best_match, source)])
dt_matches[, byear_grp := ifelse(source == "old", 2008, 2009)]

# --- merge on SOC codes from O*NET ---
dt_soc <- fread(file.path(onet_path, "Occupation Data.txt"))
dt_soc <- dt_soc[, .(soc_code = `O*NET-SOC Code`, best_match = Title)]
dt_matches <- merge(dt_matches, dt_soc, by = "best_match")

# --- load O*NET work context data ---
dt_onet <- fread(file.path(onet_path, "Work Context.txt"))
dt_onet <- dt_onet[
  `Element Name` %in% c("Face-to-Face Discussions",
                         "Contact With Others",
                         "Deal With External Customers",
                         "Work With Work Group or Team",
                         "Public Speaking"),
  .(soc_code = `O*NET-SOC Code`,
    element = `Element Name`,
    category = Category,
    value = `Data Value`)
]
dt_onet <- dt_onet[category != "n/a"]
dt_onet <- dt_onet[, .(daily = value[category == 5],
                        weekly = value[category == 5] + value[category == 4]),
                   by = .(soc_code, element)]

dt_matches <- dt_matches[soc_code %in% unique(dt_onet$soc_code)]

# --- handle unmatched industries by moving to coarser SIC levels ---
no_match <- setdiff(unique(dt_industries$sic_code), unique(dt_matches$sic_code))
dt_industries[sic_code %in% no_match, sic_code := floor(sic_code * 10) / 10]

no_match2 <- setdiff(unique(dt_industries$sic_code), unique(dt_matches$sic_code))
dt_industries[sic_code %in% no_match2, sic_code := floor(sic_code)]

# --- merge industries to matches ---
dt_merged <- merge(dt_industries, dt_matches, by = c("sic_code", "byear_grp"), all.x = TRUE)

# try the other SIC vintage for any remaining unmatched
dt_unmatched <- dt_merged[is.na(best_match), .(sic_code, sic_code_orig, byear_grp)]
dt_unmatched[, byear_grp := ifelse(byear_grp == 2008, 2009, 2008)]
dt_merged2 <- merge(dt_unmatched, dt_matches, by = c("sic_code", "byear_grp"), all.x = TRUE)
dt_merged2[, byear_grp := ifelse(byear_grp == 2008, 2009, 2008)]

dt_merged <- rbind(dt_merged[!is.na(best_match)], dt_merged2)
dt_merged <- dt_merged[, .(sic_code = sic_code_orig, byear_grp, name,
                           title_matched = best_match, soc_code)]

fwrite(dt_merged, file.path(project_root, "data", "admin", "external", "entrepreneur_industries_to_SOC.csv"))

# --- join with O*NET work context ---
dt_lmnt <- merge(dt_merged, dt_onet, by = "soc_code", all.x = TRUE)
fwrite(dt_lmnt, file.path(project_root, "data", "admin", "external", "entrepreneur_industries_to_ONET.csv"))
